import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from retrivolve.data_collector.crawler import Crawler
from retrivolve.data_collector.web_cleaner import WebCleaner


if __name__ == "__main__":
    # Manual smoke run: crawl a single fixed URL and print what WebCleaner
    # produces from the crawl result.
    target_url = "https://arxiv.org/"
    page = Crawler().crawl(target_url)
    if not page:
        # Any falsy crawl result is reported as a failure.
        print("Failed to crawl content.")
    else:
        print("Crawled content successfully.")
        cleaner = WebCleaner()
        # Run both WebCleaner calls before printing either result, so an
        # exception in the second call suppresses all result output.
        cleaned_text = cleaner.clean_html(page)
        found_links = cleaner.extract_links(page)
        # Print the clean_html result, a separator line, then the
        # extract_links result.
        for section in (cleaned_text, '=======', found_links):
            print(section)